packages <- c("CIMseq", "CIMseq.data", "tidyverse", "scSeqTools", "printr")
purrr::walk(packages, library, character.only = TRUE)
rm(packages)

##DATA
load('../data/CIMseqData.rda')
load('../data/sObj.rda')
#rename classes
renameClasses <- function(class) {
  case_when(
    class == "0" ~ "C.Stem.proximal",
    class == "1" ~ "C.Stem.distal",
    class == "2" ~ "SI.Stem",
    class == "3" ~ "C.Goblet.distal",
    class == "4" ~ "SI.Enterocytes",
    class == "5" ~ "SI.Goblet",
    class == "6" ~ "C.Goblet.proximal",
    class == "7" ~ "Enteroendocrine",
    class == "8" ~ "Tufft",
    class == "9" ~ "SI.Paneth",
    class == "10" ~ "Blood",
    TRUE ~ "error"
  )
}

cOrder <- c(
  "C.Stem.proximal", "C.Goblet.proximal", "C.Stem.distal", "C.Goblet.distal",
  "SI.Goblet", "SI.Paneth", "SI.Stem", "SI.Enterocytes",
  "Enteroendocrine", "Tufft", "Blood"
)

getData(cObjSng, "classification") <- renameClasses(getData(cObjSng, "classification"))
features <- getData(cObjMul, "features")
names(features) <- renameClasses(names(features))
getData(cObjMul, "features") <- features
fractions <- getData(sObj, "fractions")
colnames(fractions) <- renameClasses(colnames(fractions))
sObj@fractions <- fractions

Classification

plotUnsupervisedClass(cObjSng, cObjMul)

plotUnsupervisedMarkers(
  cObjSng, cObjMul,
  c("Lgr5", "Ptprc", "Chga", "Dclk1", "Alpi", "Slc26a3", "Atoh1", "Lyz1"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Hoxb13",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Mki67",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

classHeatmap(
  data = data.frame(
    gene = rownames(getData(cObjMul, "counts"))[getData(cObjMul, "features")],
    class = names(getData(cObjMul, "features"))
  ),
  counts.log = getData(cObjSng, "counts.log"),
  classes = tibble(
    sample = colnames(getData(cObjSng, "counts")), 
    class = getData(cObjSng, "classification")
  )
)

Show top 10 (by fold change) features per class.

violinPlot <- function(cpm, genes, classes, class) {
  cpm[genes, ] %>%
    t() %>% 
    matrix_to_tibble("sample") %>%
    gather(gene, cpm, -sample) %>%
    full_join(tibble(sample = colnames(cpm), class = classes), by = "sample") %>%
    ggplot() +
    geom_jitter(aes(class, cpm), height = 0, size = 0.1, alpha = 0.25) +
    facet_wrap(~gene, scales = "free_y") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(title = paste0(class, " genes"))
}


getGenes <- function(
  cpm, classes, features, class, ntop = 10
) {
  fc <- foldChangePerClass(
    cpm[features[names(features) == class], ], 
    tibble(sample = colnames(cpm), class = classes)
  )
  rownames(fc[order(fc[, class], decreasing = TRUE), ])[1:ntop]
}

cpm <- getData(cObjSng, "counts.cpm")
c <- "C.Stem.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Stem.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Stem"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Goblet"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Enterocytes"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Paneth"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

Deconvolution

plotSwarmCircos(sObj, cObjSng, cObjMul, weightCut = 0, classOrder = cOrder)

Filtering

Only detected duplicates and triplicates.
Only ERCC estimated cell number <= 4.
Weight cutoff = 5.

adj <- adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE)
samples <- rownames(adj)
rs <- rowSums(adj)
keep <- rs == 2 | rs == 3

plotSwarmCircos(
  filterSwarm(sObj, keep), cObjSng, cObjMul, weightCut = 5, 
  classOrder = cOrder, theoretical.max = 4
)

False positive stats

efm <- getEdgesForMultiplet(sObj, cObjSng, cObjMul, theoretical.max = 4) %>% 
  mutate(conn = map2(from, to, ~tibble(
    from = sort(c(.x, .y))[1], to = sort(c(.x, .y))[2]
  ))) %>% 
  select(-from, -to) %>% 
  unnest() %>% 
  distinct()

# fp <- efm %>%
#   mutate(fp = case_when(
#     str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
#     str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
#     str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
#     str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
#     TRUE ~ FALSE
#   )) %>%
#   group_by(sample) %>%
#   summarize(fp = sum(fp)) %>%
#   {setNames(pull(., fp), pull(., sample))}

fp <- efm %>%
  inner_join(select(MGA.Meta, sample, sub_tissue)) %>%
  mutate(fp = case_when(
    (str_detect(from, "^C") | str_detect(to, "^C")) & sub_tissue == "small_intestine" ~ TRUE,
    (str_detect(from, "^SI") | str_detect(to, "^SI")) & sub_tissue == "colon" ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  group_by(sample) %>%
  summarize(fp = sum(fp)) %>%
  {setNames(pull(., fp), pull(., sample))}
## Joining, by = "sample"

Detected 461 false positive connections out of 1354 total connections. Of those multiplets with a detected connection, 284 multiplets have at least one false positive out of 1177 total multiplets. The per multiplet false positive distribution is shown in the plot below.

ggplot() +
  geom_bar(aes(fp)) +
  theme_bw() +
  labs(x = "False positives per multiplet") +
  scale_x_continuous(breaks = 0:max(fp))

The type of false positive connections and their amount are shown below.

typeFreq <- efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  filter(fp) %>%
  count(from, to) %>%
  arrange(desc(n)) %>%
  as.data.frame()

typeFreq
from to n
C.Stem.distal C.Stem.proximal 288
C.Goblet.distal C.Stem.proximal 223
C.Stem.proximal SI.Stem 147
C.Goblet.distal C.Goblet.proximal 111
C.Goblet.proximal C.Stem.distal 90
C.Stem.proximal SI.Goblet 75
C.Stem.distal SI.Enterocytes 47
C.Stem.distal SI.Goblet 41
C.Goblet.distal SI.Goblet 30
C.Goblet.proximal SI.Goblet 25
C.Stem.distal SI.Stem 24
C.Stem.proximal SI.Enterocytes 24
C.Goblet.distal SI.Stem 11
C.Goblet.distal SI.Enterocytes 9
C.Stem.proximal SI.Paneth 7
C.Goblet.proximal SI.Stem 5
C.Goblet.distal SI.Paneth 1
C.Goblet.proximal SI.Enterocytes 1
#number of features corresponding to class
#total sum of counts for features corresponding to class
# features <- getData(cObjMul, "features")
# cpm <- getData(cObjSng, "counts.cpm")
# classifications <- getData(cObjSng, "classification")
# rs <- rowSums(cpm)
# table(names(features))
# 
# geneDat <- tibble(class = names(features), gene = rownames(getData(cObjSng, "counts"))[features]) %>%
#   mutate(all = rs[gene]) %>%
#   mutate(classOnly = map2_dbl(class, gene, function(c, g) {
#     sum(cpm[g, classifications == c])
#   })) %>% 
#   group_by(class) %>%
#   summarize(sum.all = sum(all), sum.classOnly = sum(classOnly)) 
# 
# inner_join(typeFreq, geneDat, by = c("from" = "class")) %>%
#   inner_join(geneDat, by = c("to" = "class")) %>%
#   mutate(
#     sum.all = map2_dbl(sum.all.x, sum.all.y, ~sum(c(.x, .y))),
#     sum.classOnly = map2_dbl(sum.classOnly.x, sum.classOnly.y, ~sum(c(.x, .y)))
#   ) %>%
#   ggplot() +
# geom_point(aes(sum.all, n))
dc <- rowSums(adjustFractions(cObjSng, cObjMul, sObj))
tibble(
  sample = names(fp),
  `False positives (n)` = fp,
  `Deconvolution detected connections (n)` = dc[names(fp)]
) %>% 
  group_by(`Deconvolution detected connections (n)`) %>%
  summarize(`False positives (n)` = sum(`False positives (n)`)) %>%
  as.data.frame()
Deconvolution detected connections (n) False positives (n)
2 140
3 198
4 103
5 20
6 0

Correlation between various parameters and the number of false positives is shown below:

mulNames <- names(fp) #note other multiplets have 0 connections

ec <- estimateCells(cObjSng, cObjMul, warn = FALSE) %>%
  filter(sample %in% mulNames) %>%
  {setNames(pull(., estimatedCellNumber), pull(., sample))}
ec <- ec[mulNames]
dec <- rowSums(adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE))
dec <- dec[mulNames]
c <- getData(sObj, "costs")
c <- c[mulNames]
tc <- colSums(getData(cObjMul, "counts"))
tc <- tc[mulNames]
dg <- apply(getData(cObjMul, "counts"), 2, function(c) length(which(c != 0)))
dg <- dg[mulNames]
feat.do <- getData(cObjMul, "counts.cpm")[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  group_by(sample) %>%
  summarize(do = length(which(cpm == 0))) %>%
  {setNames(pull(., do), pull(., sample))}
feat.do <- feat.do[mulNames]

cormat <- cor(matrix(c(fp, ec, dec, c, tc, dg, feat.do), ncol = 7))
corvec <- cormat[2:nrow(cormat), 1]
names(corvec) <- c(
  "ERCC estimated cell number", "Deconvolution estimated cell number", 
  "Deconvolution cost", "Total counts", "Total detected genes", 
  "Drop-outs in features"
)
data.frame(cor = corvec)
cor
ERCC estimated cell number 0.3647011
Deconvolution estimated cell number 0.4474336
Deconvolution cost 0.1633668
Total counts 0.2323673
Total detected genes 0.2731302
Drop-outs in features -0.2272327
cpm <- getData(cObjMul, "counts.cpm")
features <- getData(cObjMul, "features")
cpm[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  inner_join(tibble(class = names(features), gene = rownames(cpm)[features])) %>%
  group_by(sample, class) %>%
  summarize(cpm.sum = sum(cpm)) %>%
  ungroup() %>%
  inner_join(tibble(sample = names(fp), fp = fp)) %>%
  ggplot() +
  geom_point(aes(cpm.sum, fp)) +
  facet_wrap(~class)
#multiplets with high fp look to have low expression of the selected features

Distribution of fractions in connections thought to be true vs connections known to be false.

fractions <- getData(sObj, "fractions")
efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  mutate(fracs = pmap(list(sample, from, to), function(s, f, t) {
    fractions[s, c(f, t)]
  })) %>%
  unnest() %>%
  ggplot() +
  geom_boxplot(aes(fp, fracs)) +
  labs(x = "False positive", y = "Deconvolution fraction") +
  theme_bw()

Cluster multiplets and look for an overrepresentaion of false positives.

p.dist <- CIMseq::pearsonsDist(getData(cObjMul, "counts.cpm"), CIMseq::selectTopMax(getData(cObjMul, "counts.cpm"), 2000))
set.seed(234899)
u <- uwot::umap(p.dist)
rownames(u) <- colnames(getData(cObjMul, "counts"))
u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  mutate(fp.bool = fp > 0) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp.bool)) +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "Has false positive?"))
## Joining, by = "sample"

u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp)) +
  theme_bw() +
  scale_colour_viridis_c() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "False positives (n)"))
## Joining, by = "sample"